Andrew Stewart, Division of Neuroscience and Experimental Psychology.
email: andrew.stewart@manchester.ac.uk
twitter: @ajstewart_lang
First load the tidyverse and gganimate packages.
library(tidyverse)
library(gganimate)
Load the packages we need for our linear mixed models.
library(lme4)
library(lmerTest)
library(emmeans)
Load the data.
# Read the first-pass reading time data (one row per subject x item x region).
# NOTE(review): `data` shadows the base data() function — consider a more
# specific name (e.g. fp_data) if refactoring.
data <- read_csv("fpfp0.ixs")
Labelling for each condition is as follows: C1 = Neutral condition C2 = Negative condition C3 = Positive condition
Relabel the Condition variable (“cond”) and set as.factor()
# Replace the condition codes with meaningful labels, then convert the
# labelled column to a factor for modelling and plotting.
data$cond <- as.factor(recode(data$cond,
                              "C1" = "Neutral",
                              "C2" = "Negative",
                              "C3" = "Positive"))
Filter so we only have the Region 3 data - this is our critical analysis region. Ignore any zero reading times - these will be tracker loss/missing data.
# Keep only the critical region (R3) and drop zero reading times,
# which reflect tracker loss / missing data.
data_R3 <- data %>% filter(reg == "R3", DV > 0)
data_R3
## # A tibble: 572 x 7
## subj item cond seq meas reg DV
## <chr> <chr> <fct> <int> <chr> <chr> <int>
## 1 S1 I1 Neutral 60 FP R3 867
## 2 S1 I2 Positive 44 FP R3 1061
## 3 S1 I3 Negative 30 FP R3 771
## 4 S1 I4 Neutral 15 FP R3 626
## 5 S1 I5 Positive 55 FP R3 1283
## 6 S1 I6 Negative 18 FP R3 846
## 7 S1 I7 Neutral 32 FP R3 547
## 8 S1 I8 Positive 70 FP R3 1135
## 9 S1 I9 Negative 37 FP R3 1254
## 10 S1 I10 Neutral 1 FP R3 926
## # ... with 562 more rows
Let’s visualise the data for each of the 3 conditions.
# Distribution of reading times, one panel per condition.
ggplot(data_R3, aes(x = DV)) +
  geom_histogram() +
  facet_wrap(~cond)
Let’s look at the data on a Participant by Participant basis.
# Jittered raw reading times per condition, faceted by participant, with
# each condition mean overlaid as a black point.
# Fixes: `fun.y` is deprecated in ggplot2 >= 3.3 (use `fun`); `guide = FALSE`
# is deprecated (use "none"); corrected "Facted" typo in the title.
ggplot(data_R3, aes(x = cond, y = DV, colour = cond)) +
  geom_jitter(width = .1, alpha = .5) +
  stat_summary(fun = mean, geom = "point", colour = "black") +
  facet_wrap(~subj) +
  scale_color_discrete(guide = "none") +
  labs(y = "Reading Time (msec.)", x = "Condition",
       title = "Critical Region, First Pass Reading Times Facet Wrapped by Participant")
And via gganimate.
# Animated version: one frame per participant, transitioning with fades.
# Fixes: `fun.y` -> `fun` and `guide = FALSE` -> "none" (both deprecated
# in recent ggplot2).
ggplot(data_R3, aes(x = cond, y = DV, colour = cond)) +
  geom_jitter(width = .2, alpha = .8, size = 3) +
  stat_summary(fun = mean, geom = "point", colour = "black", size = 5) +
  scale_color_discrete(guide = "none") +
  labs(y = "Reading Time (msec.)", x = "Condition",
       title = "Critical Region, First Pass Reading Times \nAnimated by Participant {(closest_state)}") +
  theme(text = element_text(size = 15)) +
  transition_states(subj, transition_length = 1, state_length = 1) +
  enter_fade() +
  exit_fade()
Let’s work out for how many participants the effect went in the predicted direction. First we aggregate the data by participant.
# Per-participant condition means and SDs of the reading times.
data_agg <- data_R3 %>%
  group_by(subj, cond) %>%
  summarise(mean = mean(DV), sd = sd(DV))
For how many people was the Negative condition faster than the Positive?
# Count participants whose Negative mean is below their Positive mean.
# Rows align across conditions because data_agg is ordered by subj, cond.
sum(data_agg$mean[data_agg$cond == "Negative"] <
      data_agg$mean[data_agg$cond == "Positive"])
## [1] 18
For how many people was the Negative condition faster than the Neutral?
# Count participants whose Negative mean is below their Neutral mean.
sum(data_agg$mean[data_agg$cond == "Negative"] <
      data_agg$mean[data_agg$cond == "Neutral"])
## [1] 19
For how many people were the above both TRUE?
# Count participants for whom both orderings held simultaneously.
sum(data_agg$mean[data_agg$cond == "Negative"] <
      data_agg$mean[data_agg$cond == "Positive"] &
    data_agg$mean[data_agg$cond == "Negative"] <
      data_agg$mean[data_agg$cond == "Neutral"])
## [1] 16
Let’s do the same but this time by Items.
# Jittered raw reading times per condition, faceted by item, with each
# condition mean overlaid as a black point.
# Fixes: deprecated `fun.y` -> `fun`, `guide = FALSE` -> "none", and the
# "Facted" typo in the title.
ggplot(data_R3, aes(x = cond, y = DV, colour = cond)) +
  geom_jitter(width = .1, alpha = .5) +
  stat_summary(fun = mean, geom = "point", colour = "black") +
  facet_wrap(~item) +
  scale_color_discrete(guide = "none") +
  labs(y = "Reading Time (msec.)", x = "Condition",
       title = "Critical Region, First Pass Reading Times Facet Wrapped by Item")
# Per-item condition means and SDs, then count items where the Negative
# mean is below the Positive mean.
data_agg <- data_R3 %>%
  group_by(item, cond) %>%
  summarise(mean = mean(DV), sd = sd(DV))
sum(data_agg$mean[data_agg$cond == "Negative"] <
      data_agg$mean[data_agg$cond == "Positive"])
## [1] 17
# Count items where the Negative mean is below the Neutral mean.
sum(data_agg$mean[data_agg$cond == "Negative"] <
      data_agg$mean[data_agg$cond == "Neutral"])
## [1] 18
Let’s build a raincloud plot.
library(RColorBrewer)
library(plyr) #note, need to detach this after this plot as clashes with aspects of dplyr
source("https://gist.githubusercontent.com/ajstewartlang/6c4cd8ab9e0c27747424acdfb3b4cff6/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
# Shared theme for the raincloud plots: uniform 12pt text, angled x-axis
# labels, no panel border or grid, and explicit black axis lines.
# Fix: top-level assignment with `<-` rather than `=` (R style convention).
raincloud_theme <- theme(
  text = element_text(size = 12),
  axis.title.x = element_text(size = 12),
  axis.title.y = element_text(size = 12),
  axis.text = element_text(size = 12),
  axis.text.x = element_text(angle = 45, vjust = 0.5),
  legend.title = element_text(size = 12),
  legend.text = element_text(size = 12),
  legend.position = "right",
  # plot.title = element_text(lineheight = .8, size = 12),
  panel.border = element_blank(),
  panel.grid.minor = element_blank(),
  panel.grid.major = element_blank(),
  # NOTE(review): `size` in element_line is superseded by `linewidth` in
  # ggplot2 >= 3.4 — kept as `size` here for compatibility with the
  # ggplot2 version this script was written against.
  axis.line.x = element_line(colour = 'black', size = 0.5, linetype = 'solid'),
  axis.line.y = element_line(colour = 'black', size = 0.5, linetype = 'solid'))
# Lower summary bound: one standard deviation below the mean of x.
lb <- function(x) {
  mean(x) - sd(x)
}
# Upper summary bound: one standard deviation above the mean of x.
ub <- function(x) {
  mean(x) + sd(x)
}
# Per-condition summary (mean, median, mean +/- 1 SD) via plyr::ddply.
# NOTE(review): sumld does not appear to be used by the plot below —
# confirm whether it is needed or can be dropped.
sumld <- ddply(data_R3, ~cond, summarise, mean = mean(DV), median = median(DV), lower = lb(DV), upper = ub(DV))
# Raincloud plot for the critical region: half-violin density, jittered
# raw points, and a narrow boxplot per condition, flipped horizontally.
# Fix: `guides(... = FALSE)` is deprecated in recent ggplot2; "none" is
# the supported value for suppressing a legend.
ggplot(data = data_R3, aes(y = DV, x = cond, fill = cond)) +
  geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8, trim = FALSE) +
  geom_point(aes(y = DV, color = cond), position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  geom_boxplot(width = .1, outlier.shape = NA, alpha = 0.5) +
  #expand_limits(x = 3) +
  guides(fill = "none") +
  guides(color = "none") +
  scale_color_brewer(palette = "Accent") +
  scale_fill_brewer(palette = "Accent") +
  coord_flip() +
  theme_bw() +
  raincloud_theme +
  labs(y = "Reading Time (msec.)", x = "Condition", title = "Critical Region, First Pass Reading Times")
detach("package:plyr", unload = TRUE) # plyr masks dplyr functions, so detach it
Build our first linear mixed model with maximal random effects structure.
# Null model: maximal random effects (by-subject and by-item random
# intercepts and condition slopes) but no fixed effect of condition.
model.null <- lmer(DV ~ (1 + cond | subj) + (1 + cond | item),
                   data = data_R3, REML = TRUE)
# Experimental model: identical random structure plus the condition
# fixed effect.
model <- lmer(DV ~ cond + (1 + cond | subj) + (1 + cond | item),
              data = data_R3, REML = TRUE)
Check the experimental model differs from the Null model.
# Likelihood-ratio test of the condition fixed effect; lme4 refits both
# models with ML (rather than REML) before comparing.
anova(model.null, model)
## refitting model(s) with ML (instead of REML)
## Data: data_R3
## Models:
## model.null: DV ~ (1 + cond | subj) + (1 + cond | item)
## model: DV ~ cond + (1 + cond | subj) + (1 + cond | item)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## model.null 14 8648.1 8709.0 -4310.0 8620.1
## model 16 8642.3 8711.9 -4305.1 8610.3 9.817 2 0.007384 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Generate a summary of our model parameters plus run some pairwise comparisons.
# Parameter estimates for the experimental model; lmerTest supplies
# Satterthwaite-approximated t-tests for the fixed effects.
summary(model)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DV ~ cond + (1 + cond | subj) + (1 + cond | item)
## Data: data_R3
##
## REML criterion at convergence: 8581.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0879 -0.5173 -0.0208 0.4988 4.2992
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## subj (Intercept) 66595.9 258.06
## condNeutral 2053.1 45.31 1.00
## condPositive 147.5 12.14 -0.32 -0.29
## item (Intercept) 34565.7 185.92
## condNeutral 383.2 19.57 1.00
## condPositive 4833.4 69.52 -1.00 -1.00
## Residual 171365.6 413.96
## Number of obs: 572, groups: subj, 24; item, 24
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 815.42 71.57 32.26 11.393 7.61e-13 ***
## condNeutral 142.03 43.66 143.18 3.253 0.00142 **
## condPositive 104.46 44.81 29.43 2.331 0.02679 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cndNtr
## condNeutral -0.086
## condPositiv -0.464 0.432
# Estimated marginal means and uncorrected pairwise comparisons between
# the three conditions (adjust = "none" means no multiplicity correction).
emmeans(model, pairwise~cond, adjust="none")
## $emmeans
## cond emmean SE df lower.CL upper.CL
## Negative 815.4171 71.57948 31.36 669.4985 961.3357
## Neutral 957.4509 80.57012 32.56 793.4451 1121.4566
## Positive 919.8750 64.45063 26.39 787.4909 1052.2591
##
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio p.value
## Negative - Neutral -142.03374 43.67792 11.27 -3.252 0.0075
## Negative - Positive -104.45787 44.81766 12.18 -2.331 0.0377
## Neutral - Positive 37.57587 47.18139 13.35 0.796 0.4397
Check we’re happy with the model residuals.
# Q-Q plot of the model residuals against a normal distribution, as a
# visual check of the normality assumption.
qqnorm(resid(model))
I can live with these…
Now let’s look at the subsequent region of text (Region 4). Exclude missing data/tracker loss points where the DV==0.
# Keep only the post-critical region (R4), excluding zero reading times
# (tracker loss / missing data).
data_R4 <- data %>% filter(reg == "R4", DV > 0)
Need to load the plyr package again.
library(plyr)
Another Raincloud plot.
# Per-condition summary (mean, median, mean +/- 1 SD) for Region 4.
# NOTE(review): as with the Region 3 version, sumld does not appear to be
# used by the plot below — confirm whether it is needed.
sumld <- ddply(data_R4, ~cond, summarise, mean = mean(DV), median = median(DV), lower = lb(DV), upper = ub(DV))
# Raincloud plot for the post-critical region, mirroring the Region 3 plot.
# Fix: `guides(... = FALSE)` is deprecated in recent ggplot2; "none" is
# the supported value for suppressing a legend.
ggplot(data = data_R4, aes(y = DV, x = cond, fill = cond)) +
  geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8, trim = FALSE) +
  geom_point(aes(y = DV, color = cond), position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  geom_boxplot(width = .1, outlier.shape = NA, alpha = 0.5) +
  #expand_limits(x = 3) +
  guides(fill = "none") +
  guides(color = "none") +
  scale_color_brewer(palette = "Accent") +
  scale_fill_brewer(palette = "Accent") +
  coord_flip() +
  theme_bw() +
  raincloud_theme +
  labs(y = "Reading Time (msec.)", x = "Condition", title = "Post-Critical Region, First Pass Reading Times")
detach("package:plyr", unload = TRUE) # plyr masks dplyr functions, so detach it
Build the linear mixed model for Region 4 as we did for Region 3.
# Intercepts-only null and experimental models for Region 4, compared
# with a likelihood-ratio test (anova refits with ML).
model.null <- lmer(DV ~ (1 | subj) + (1 | item), data = data_R4, REML = TRUE)
model <- lmer(DV ~ cond + (1 | subj) + (1 | item), data = data_R4, REML = TRUE)
anova(model.null, model)
## refitting model(s) with ML (instead of REML)
## Data: data_R4
## Models:
## model.null: DV ~ (1 | subj) + (1 | item)
## model: DV ~ cond + (1 | subj) + (1 | item)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## model.null 4 8924.1 8941.5 -4458.1 8916.1
## model 6 8923.8 8949.9 -4455.9 8911.8 4.3149 2 0.1156
# Parameter estimates for the Region 4 experimental model.
summary (model)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DV ~ cond + (1 | subj) + (1 | item)
## Data: data_R4
##
## REML criterion at convergence: 8881.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5734 -0.5398 -0.1635 0.4043 5.8942
##
## Random effects:
## Groups Name Variance Std.Dev.
## subj (Intercept) 122383 349.8
## item (Intercept) 45746 213.9
## Residual 300455 548.1
## Number of obs: 571, groups: subj, 24; item, 24
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1395.09 92.77 45.32 15.039 <2e-16 ***
## condNeutral 66.92 56.33 522.13 1.188 0.235
## condPositive -49.05 56.26 522.14 -0.872 0.384
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cndNtr
## condNeutral -0.306
## condPositiv -0.307 0.505
# Uncorrected pairwise comparisons for Region 4; note the omnibus model
# comparison above was not significant, so interpret with caution.
emmeans (model, pairwise~cond, adjust="none")
## $emmeans
## cond emmean SE df lower.CL upper.CL
## Negative 1395.088 92.76705 45.32 1208.282 1581.894
## Neutral 1462.005 92.62314 45.05 1275.457 1648.552
## Positive 1346.042 92.57551 44.95 1159.580 1532.504
##
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio p.value
## Negative - Neutral -66.91684 56.33498 522.13 -1.188 0.2354
## Negative - Positive 49.04624 56.26042 522.14 0.872 0.3837
## Neutral - Positive 115.96308 56.02282 522.04 2.070 0.0390
Looking at the first pass data animated by Region.
# Animated jitter + boxplot of first-pass times, one frame per region
# (excluding R0 and zero reading times).
# Fix: `guide = FALSE` is deprecated in recent ggplot2; use "none".
data1 <- filter(data, reg != "R0" & DV > 0)
ggplot(data1, aes(x = cond, y = DV, colour = cond)) +
  geom_jitter(width = .2, alpha = .8, size = 3) +
  geom_boxplot(width = .5, outlier.shape = NA, alpha = 0.3, colour = "black") +
  scale_color_discrete(guide = "none") +
  labs(y = "Reading Time (msec.)", x = "Condition",
       title = "First Pass Reading Times Animated by Region. \nThe Critical Region is R3. \nCurrent Region is {(closest_state)}") +
  theme(text = element_text(size = 15)) +
  coord_flip() +
  transition_states(reg, transition_length = 1, state_length = 1) +
  enter_fade() +
  exit_fade()
Read in the Regressions Out data.
# Read the first-pass regressions-out data (same column layout as the
# reading time file, per the column specification below).
FPRO <- read_csv("fproro0.ixs")
## Parsed with column specification:
## cols(
## subj = col_character(),
## item = col_character(),
## cond = col_character(),
## seq = col_integer(),
## meas = col_character(),
## reg = col_character(),
## DV = col_integer()
## )
Recode as before.
# Keep the critical region, drop the C0 condition, and relabel the
# remaining condition codes before converting to a factor.
data_R3 <- FPRO %>% filter(reg == "R3", cond != "C0")
data_R3$cond <- as.factor(recode(data_R3$cond,
                                 "C1" = "Neutral",
                                 "C2" = "Negative",
                                 "C3" = "Positive"))
Let’s generate some summary data.
# Per-condition mean and SD of the regression measure.
data_summ <- data_R3 %>%
  group_by(cond) %>%
  summarise(mean = mean(DV), sd = sd(DV))
Build a bar chart.
# Bar chart of mean regression percentage per condition, with the rounded
# value printed just above each bar.
# Fix: `guide = FALSE` is deprecated in recent ggplot2; use "none".
ggplot(data_summ, aes(x = cond, y = mean, fill = cond)) +
  geom_col() +
  geom_text(aes(label = format(round(mean, 0))), nudge_y = 1) +
  scale_fill_discrete(guide = "none") +
  labs(y = "Regressions (%)", x = "Condition",
       title = "Regressions Out of Critical Region")
Recode the DV as binomial.
# Convert the percentage-coded DV (100 = regression made, 0 = none) to a
# 0/1 outcome for the binomial GLMM.
# NOTE(review): this relies on dplyr::recode matching numeric values via
# quoted names — confirm DV only ever takes the values 0 and 100.
data_R3$DV <- recode (data_R3$DV, "100"=1 ,"0"=0)
Build the null and experimental glmm given sampling from the binomial distribution.
# Binomial GLMMs (logit link) for regressions out of the critical region:
# null vs. condition model, compared with a likelihood-ratio test.
model.null <- glmer(DV ~ (1 | subj) + (1 | item), data = data_R3, family = binomial)
model <- glmer(DV ~ cond + (1 | subj) + (1 | item), data = data_R3, family = binomial)
anova(model.null, model)
## Data: data_R3
## Models:
## model.null: DV ~ (1 | subj) + (1 | item)
## model: DV ~ cond + (1 | subj) + (1 | item)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## model.null 3 601.97 614.91 -297.98 595.97
## model 5 605.70 627.28 -297.85 595.70 0.2617 2 0.8773
# Parameter estimates (log-odds scale) for the critical-region GLMM.
summary (model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: DV ~ cond + (1 | subj) + (1 | item)
## Data: data_R3
##
## AIC BIC logLik deviance df.resid
## 605.7 627.3 -297.9 595.7 548
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.3096 -0.6118 -0.4044 0.7636 3.1871
##
## Random effects:
## Groups Name Variance Std.Dev.
## subj (Intercept) 0.8362 0.9144
## item (Intercept) 0.0000 0.0000
## Number of obs: 553, groups: subj, 24; item, 24
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.3095 0.2647 -4.947 7.55e-07 ***
## condNeutral 0.1093 0.2539 0.431 0.667
## condPositive 0.1162 0.2515 0.462 0.644
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cndNtr
## condNeutral -0.485
## condPositiv -0.488 0.507
Now look at Regressions Out of the post-critical region (Region 4).
# Same preparation as for the critical region, but for Region 4:
# filter, relabel, convert to factor, then summarise per condition.
data_R4 <- FPRO %>% filter(reg == "R4", cond != "C0")
data_R4$cond <- as.factor(recode(data_R4$cond,
                                 "C1" = "Neutral",
                                 "C2" = "Negative",
                                 "C3" = "Positive"))
data_summ <- data_R4 %>%
  group_by(cond) %>%
  summarise(mean = mean(DV), sd = sd(DV))
Build a bar chart.
# Bar chart of mean regression percentage per condition for Region 4.
# Fix: `guide = FALSE` is deprecated in recent ggplot2; use "none".
ggplot(data_summ, aes(x = cond, y = mean, fill = cond)) +
  geom_col() +
  geom_text(aes(label = format(round(mean, 0))), nudge_y = 1) +
  scale_fill_discrete(guide = "none") +
  labs(y = "Regressions (%)", x = "Condition",
       title = "Regressions Out of Post-Critical Region")
Recode DV as binomial.
# Convert the percentage-coded DV (100 = regression made, 0 = none) to a
# 0/1 outcome for the binomial GLMM.
# NOTE(review): as above, relies on dplyr::recode matching numeric values
# via quoted names — confirm DV only ever takes the values 0 and 100.
data_R4$DV <- recode (data_R4$DV, "100"=1 ,"0"=0)
Build the glmms.
# Binomial GLMMs for regressions out of the post-critical region, with a
# likelihood-ratio comparison of null vs. condition model.
model.null <- glmer(DV ~ (1 | subj) + (1 | item), data = data_R4, family = binomial)
model <- glmer(DV ~ cond + (1 | subj) + (1 | item), data = data_R4, family = binomial)
anova(model.null, model)
## Data: data_R4
## Models:
## model.null: DV ~ (1 | subj) + (1 | item)
## model: DV ~ cond + (1 | subj) + (1 | item)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## model.null 3 261.67 274.28 -127.83 255.67
## model 5 259.10 280.12 -124.55 249.10 6.5681 2 0.03748 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Parameter estimates (log-odds scale) for the Region 4 GLMM.
summary (model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: DV ~ cond + (1 | subj) + (1 | item)
## Data: data_R4
##
## AIC BIC logLik deviance df.resid
## 259.1 280.1 -124.6 249.1 490
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.2733 -0.2756 -0.2033 -0.1468 5.2443
##
## Random effects:
## Groups Name Variance Std.Dev.
## subj (Intercept) 1.2797 1.1313
## item (Intercept) 0.1814 0.4259
## Number of obs: 495, groups: subj, 24; item, 24
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5034 0.5094 -6.877 6.12e-12 ***
## condNeutral 0.2484 0.5076 0.489 0.6246
## condPositive 1.0495 0.4607 2.278 0.0227 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cndNtr
## condNeutral -0.564
## condPositiv -0.660 0.605
As the model comparison is significant, run some pairwise comparisons to determine where the effect is. Ask for the output on the response scale.
# Uncorrected pairwise comparisons; type = "response" back-transforms the
# estimates from the logit scale to probabilities / odds ratios.
emmeans (model, pairwise~cond, adjust="none", type="response")
## $emmeans
## cond prob SE df asymp.LCL asymp.UCL
## Negative 0.02921439 0.01444844 Inf 0.01096581 0.07551245
## Neutral 0.03714549 0.01697488 Inf 0.01498975 0.08908695
## Positive 0.07914715 0.02932761 Inf 0.03759123 0.15904989
##
## Confidence level used: 0.95
## Intervals are back-transformed from the logit scale
##
## $contrasts
## contrast odds.ratio SE df z.ratio p.value
## Negative / Neutral 0.7800601 0.3959661 Inf -0.489 0.6246
## Negative / Positive 0.3501293 0.1612980 Inf -2.278 0.0227
## Neutral / Positive 0.4488491 0.1940912 Inf -1.853 0.0640
##
## Tests are performed on the log odds ratio scale